STAT 679: Problem Set #2
Q1. American Time Use Survey
Part (a)
Skim the data without visualizing it. Write three questions for follow-up analysis. Among these, at least one should compare multiple activities with one another, and at least one should compare time-points within a single activity
- Among all activities, which ones do people engage in more than any other?
- Between what times did more than 90% of the people engage in martial arts?
- Around noon, did people prefer hunting or dancing?
- What activities did a majority (>50%) of the people do between 12 PM and 6 PM on New Year’s Day?
Part (b)
Make a plot of prop_smooth over time for each
activity. Justify your choice of visual encoding – what questions does
it help answer efficiently?
In the visualization below, each activity is shown in its own facet (one row per activity), and the fill color encodes how far the percentage of people doing that activity lies above or below 50%.
# Load the activity data and derive the columns used by the plots below.
# The time stamps end in "Z" (the UTC designator), so they are parsed with
# tz = "UTC" rather than the machine's local time zone. The mutate() is a plain
# row-wise transformation, so the data are not grouped.
activities <- read.csv("https://raw.githubusercontent.com/krisrs1128/stat992_f23/main/exercises/ps2/activity.csv") %>%
  mutate(
    time = as.POSIXct(time, tz = "UTC", tryFormats = "%Y-%m-%dT%H:%M:%SZ"),
    # prop_smooth rescaled by 100 -- assumes prop_smooth is a proportion in
    # [0, 1]; TODO confirm against the data.
    ps_percentage = prop_smooth * 100
  )

# Width of one time step (a difftime); used as the width of each rectangle in
# the step plot. Taken as the smallest gap between distinct time stamps so the
# value does not depend on how the rows happen to be ordered.
step_val <- min(diff(sort(unique(activities$time))))
# Step plot of ps_percentage over time, one facet (row) per activity.
# Facets are ordered with reorder(activity, -prop). A horizontal reference line
# marks 50%, and each time step gets a rectangle spanning from 50 to the
# observed percentage, filled by its signed distance from 50.
ggplot(activities, aes(x = time)) +
  geom_step(aes(y = ps_percentage)) +
  # Constant baseline: geom_hline draws one line per panel instead of mapping a
  # constant through aes() for every row.
  geom_hline(yintercept = 50) +
  geom_rect(
    aes(
      xmin = time,
      xmax = time + step_val,
      ymin = 50,
      ymax = ps_percentage,
      fill = ps_percentage - 50
    )
  ) +
  facet_wrap(~ reorder(activity, -prop), ncol = 1, strip.position = "left") +
  labs(
    title = "Activities Americans did on New Year's day",
    x = "Time",
    y = "",
    fill = "Percentage of people"
  ) +
  scale_x_datetime(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0, 0.1, 0)) +
  # The fill is the deviation from 50 (range -50..50); the breaks are relabelled
  # 0..100 so the legend reads as a percentage.
  scale_fill_gradient2(
    low = "violetred3", high = "chartreuse4", limits = c(-50, 50),
    breaks = seq(-50, 50, 10), labels = seq(0, 100, 10),
    guide = guide_colorbar(barwidth = 20, barheight = 1)
  ) +
  theme(
    strip.text.y.left = element_text(angle = 0, hjust = 1),
    axis.text.y = element_blank()
  )
Part (c)
Create an alternative visualization using a different encoding. For example, you may (but do not have to) use a heatmap, horizon or ridge line plot. Compare the trade-offs involved between the two encodings. What questions are easier to answer using your visualization from (b), and which are easier to visualize using your visualization from (c)?
I chose to make a horizon plot for the given data. Fortunately, the visualizations from both (b) and (c) help answer all the questions I stated in part (a). However, the horizon plot has the additional advantage of using the full plot area, so it is easier to interpret.
# Horizon plot of ps_percentage over time, one facet (row) per activity.
# Bands are defined by cut points every 10 units from 0 to 100, measured around
# an origin of 50, and coloured with the diverging "PiYG" palette.
cutpoints <- seq(0, 100, by = 10)
ggplot(activities) +
  geom_horizon(
    aes(time, ps_percentage, fill = after_stat(Cutpoints)),
    origin = 50,
    horizonscale = cutpoints
  ) +
  facet_wrap(~ reorder(activity, -prop), ncol = 1, strip.position = "left") +
  scale_x_datetime(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_hcl(palette = "PiYG") +
  labs(
    title = "Activities Americans did on New Year's day",
    x = "Time",
    y = "",
    fill = "Percentage of people"
  ) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  guides(fill = guide_legend(byrow = TRUE, reverse = TRUE)) +
  theme(
    strip.text.y.left = element_text(angle = 0, hjust = 1),
    axis.text.y = element_blank()
  )
Q2. Midwestern Power Plants
Part (a)
Create a map of power plants that shows where plants are
located, how they generate electricity (primary_fuel), and
how much generation capacity they have
(capacity_mw).
# Read the power-plant locations and fetch a background image around a fixed
# longitude/latitude with a buffer of 1e6 (10e5 in the original notation).
plants <- read_sf("https://raw.githubusercontent.com/krisrs1128/stat992_f23/main/exercises/ps2/power_plants.geojson")
basemap <- cc_location(loc = c(-89.63490, 42.90875), buffer = 10e5)

# Static map: one dot per plant, sized by capacity_mw and coloured by
# primary_fuel, drawn over the background image with the legend outside on the
# right.
tm_shape(basemap) +
  tm_rgb() +
  tm_shape(plants) +
  tm_dots(
    size = "capacity_mw",
    col = "primary_fuel",
    alpha = 0.7,
    palette = "Set2",
    title = "Fuel Type",
    title.size = "Capacity (in MW)",
    scale = 2
  ) +
  tm_layout(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    legend.outside = TRUE,
    legend.outside.position = "right",
    main.title = " Power plants in the Mid-west"
  )
Part (b)
# Draw the map for the currently selected power plants.
#
# df:        the plants data (passed to dplyr::filter and tm_shape).
# selected_: logical mask, one entry per row of df; only rows where it is TRUE
#            are drawn.
#
# Uses the file-level `basemap` object as the background layer. Returns the
# tmap object built from the layers below.
update_map <- function(df, selected_) {
  selected_data <- df %>% filter(selected_)

  tm_shape(basemap) +
    tm_rgb() +
    tm_shape(selected_data) +
    tm_dots(
      size = "capacity_mw",
      col = "primary_fuel",
      alpha = 0.7,
      palette = "Set2",
      title = "Fuel Type",
      title.size = "Capacity (in MW)",
      # The legend is hidden here; the histogram next to the map carries the
      # fuel-type legend. TRUE/FALSE are spelled out because T and F are
      # ordinary variables that can be reassigned.
      legend.show = FALSE,
      scale = 2
    ) +
    tm_layout(
      legend.outside = TRUE,
      legend.outside.position = "left"
    )
}
# Draw the histogram of log_capacity, stacked by primary_fuel.
#
# df:        the plants data; must contain log_capacity and primary_fuel.
# selected_: logical mask, one entry per row of df, marking the plants that are
#            currently selected.
#
# All plants are drawn as a faded background layer (alpha = 0.3) and the
# selected plants are drawn on top at full opacity. Returns the ggplot object.
update_histogram <- function(df, selected_) {
  selected_data <- df %>% filter(selected_)

  ggplot(NULL, aes(log_capacity, col = primary_fuel, fill = primary_fuel)) +
    # bins = 30 is the stat_bin default, written out to make the choice explicit
    # and to avoid the "pick better value" message on every redraw.
    geom_histogram(data = df, alpha = 0.3, bins = 30) +
    geom_histogram(data = selected_data, bins = 30) +
    scale_y_continuous(expand = c(0, 0, 0.1, 0)) +
    scale_color_brewer(palette = "Set2", guide = "none") +
    scale_fill_brewer(palette = "Set2") +
    labs(
      # The x variable is log_capacity, not capacity_mw, so the label says so.
      x = "Log Capacity of the Power Plant (capacity in Mega Watts)",
      y = "Number of Power Plants",
      fill = "Type of fuel"
    ) +
    theme(legend.position = "right")
}
# Page layout: a title, then one row split into two equal columns -- the
# brushable histogram on the left and the map on the right.
ui <- local({
  # Brushing is restricted to the x direction and reported as input$plot_brush.
  histogram_brush <- brushOpts("plot_brush", direction = "x")

  histogram_column <- column(
    6,
    h4("Brush over the plot to interact with the map"),
    plotOutput("stacked_histogram", brush = histogram_brush)
  )

  map_column <- column(
    6,
    plotOutput("map", height = 600)
  )

  fluidPage(
    h3("Mid-Western Power Plants"),
    fluidRow(histogram_column, map_column),
    theme = bs_theme(bootswatch = "minty")
  )
})
# Server logic: keeps a logical mask of the selected plants (one entry per row
# of the file-level `plants` object) and redraws both outputs whenever it
# changes.
server <- function(input, output) {
  # Start with every plant selected.
  selected <- reactiveVal(rep(TRUE, nrow(plants)))

  # ignoreNULL = FALSE makes the observer also fire when the brush is cleared
  # (input$plot_brush becomes NULL); by default that event is ignored and the
  # previous selection would stay on screen.
  observeEvent(input$plot_brush, {
    brushed <- brushedPoints(plants, input$plot_brush, allRows = TRUE)$selected_
    # A cleared brush, or a brush that covers no plants, yields an all-FALSE
    # mask; fall back to showing every plant in that case.
    if (!any(brushed)) {
      brushed <- rep(TRUE, nrow(plants))
    }
    selected(brushed)
  }, ignoreNULL = FALSE)

  output$stacked_histogram <- renderPlot(update_histogram(plants, selected()))
  output$map <- renderPlot(update_map(plants, selected()))
}
# shinyApp(ui, server)